import pandas as pd
import numpy as np
import numerapi
import os
import plotly.express as px
import plotly.graph_objects as go
import catboost
import optuna
# Load the full Numerai training data; each row is one (era, stock) observation.
training_set = pd.read_parquet("data/numerai_training_data.parquet")
# Feature columns are all named "feature_*".
feature_names = [f for f in training_set.columns if "feature_" in f]
# Ordered array of unique eras (time periods); the CV folds below are built era-wise.
eras = training_set.era.unique()
# Era-wise cross-validation with an embargo: each fold's validation eras are
# separated from its training eras by EMBARGO_SIZE eras on both sides, so
# overlapping targets cannot leak between training and validation.
NUM_FOLDS = 5
FOLD_SIZE = int(len(eras) / NUM_FOLDS)
EMBARGO_SIZE = 64  # renamed from the typo'd EMBAGO_SIZE (only used in this cell)
# generate splits: one column per split labelling each era's role
splits_df = pd.DataFrame({
    'era': eras,
}).set_index("era")
# Slide the (embargo + validation + embargo) window so the first window starts
# at the first era and the last window ends at the final era.
step_size = (len(eras) - (FOLD_SIZE + 2 * EMBARGO_SIZE)) // (NUM_FOLDS - 1)
for i in range(NUM_FOLDS):
    start = i * step_size
    end = start + FOLD_SIZE + 2 * EMBARGO_SIZE
    # middle of the window is validation; the flanking eras are embargoed
    validation_eras = eras[(start + EMBARGO_SIZE):(end - EMBARGO_SIZE)]
    # everything outside the window is available for training
    if start == 0:
        training_eras = eras[end:]
    else:
        training_eras = np.concatenate([eras[:start], eras[end:]])
    col = 'split_{}'.format(i)
    splits_df[col] = "embargo"
    splits_df.loc[validation_eras, col] = "validation"
    splits_df.loc[training_eras, col] = "training"
# show the resulting per-era split assignment (notebook display)
splits_df
| split_0 | split_1 | split_2 | split_3 | split_4 | |
|---|---|---|---|---|---|
| era | |||||
| 0001 | embargo | training | training | training | training |
| 0002 | embargo | training | training | training | training |
| 0003 | embargo | training | training | training | training |
| 0004 | embargo | training | training | training | training |
| 0005 | embargo | training | training | training | training |
| ... | ... | ... | ... | ... | ... |
| 0570 | training | training | training | training | embargo |
| 0571 | training | training | training | training | embargo |
| 0572 | training | training | training | training | embargo |
| 0573 | training | training | training | training | embargo |
| 0574 | training | training | training | training | embargo |
574 rows × 5 columns
# plot splits_df with plotly
from plotly.colors import n_colors
# split column names; also reused later by the cross-validation loop
splits = [s for s in splits_df.columns.tolist() if "split" in s]
# BUGFIX: the previous expression `"embargo" in splits_df['split_i']` tested
# membership against the Series *index* (era labels), which is always False,
# so every column was painted white. It also supplied NUM_FOLDS colours for
# NUM_FOLDS + 1 columns. Colour cell-by-cell instead: embargo cells red,
# everything else (and the era column) white.
fig = go.Figure(data=[go.Table(
    header=dict(values=["<b>Era<b>"] + ['<b>Split {}<b>'.format(i) for i in range(NUM_FOLDS)]),
    cells=dict(
        values=[eras] + [splits_df['split_{}'.format(i)] for i in range(NUM_FOLDS)],
        # one colour list per column; each inner list has one colour per row
        fill_color=[['#FFFFFF'] * len(eras)] + [
            ['#FF0000' if v == "embargo" else '#FFFFFF'
             for v in splits_df['split_{}'.format(i)]]
            for i in range(NUM_FOLDS)
        ],
        align='center',
    ),
)])
fig.show(renderer='notebook')
from scipy.stats import gmean
# hyper parameter tuning with optuna
def objective(trial):
    """Optuna objective: cross-validated era correlation for CatBoost params.

    For every CV split, trains a CatBoostRegressor on that split's training
    eras, predicts the validation eras, and computes the mean per-era
    correlation between predictions and targets. Returns the geometric mean
    of those per-split means.

    Args:
        trial: optuna.Trial used to sample hyper parameters.

    Returns:
        float: geometric mean across splits of the mean era correlation.
            NOTE(review): gmean yields nan if any split's mean correlation
            is <= 0 — acceptable here since such trials score poorly anyway.
    """
    # suggest hyper parameters to try at each iteration
    # (suggest_float(..., log=True) replaces the deprecated suggest_loguniform)
    params = {
        "iterations": trial.suggest_int("iterations", 100, 2000),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1, log=True),
        "depth": trial.suggest_int("depth", 4, 10),
        "task_type": "GPU",
    }
    all_correlations = []
    # loop over each cross validation fold
    for split in splits:
        train_eras = splits_df.loc[splits_df[split] == "training"].index
        validation_eras = splits_df.loc[splits_df[split] == "validation"].index
        # hoist the row subsets: each isin() mask scans the whole dataset,
        # and the original recomputed them up to four times per fold
        train_df = training_set.loc[training_set.era.isin(train_eras)]
        val_df = training_set.loc[training_set.era.isin(validation_eras)]
        # create a model with the same parameters for each fold
        model = catboost.CatBoostRegressor(**params)
        model.fit(
            X=train_df[feature_names],
            y=train_df["target"],
            verbose=False
        )
        # make predictions on the validation fold
        preds_df = pd.DataFrame(
            {
                "prediction": model.predict(val_df[feature_names], verbose=False),
                "era": val_df["era"],
                "target": val_df["target"],
            },
            index=val_df.index,
        )
        # correlation between prediction and target, grouped by era
        era_correlations = preds_df.groupby("era").apply(
            lambda era: np.corrcoef(era["prediction"], era["target"])[0, 1]
        )
        # mean across all eras of this split
        all_correlations.append(era_correlations.mean())
    # some splits tend to have higher correlation than others;
    # the geometric mean prevents those splits from skewing the results
    return gmean(all_correlations)
# hyper parameter optimization with random search
study = optuna.create_study(
    direction="maximize",  # objective() returns a correlation to maximize
    study_name="catboost_hyper_parameter_tuning",
    sampler=optuna.samplers.RandomSampler(seed=42),  # seeded for reproducibility
)
# 30 trials (comment previously said 20, which disagreed with n_trials)
study.optimize(objective, n_trials=30)
[I 2022-05-02 20:39:18,361] A new study created in memory with name: catboost_hyper_parameter_tuning [I 2022-05-02 20:48:08,244] Trial 0 finished with value: 0.015598954058755022 and parameters: {'iterations': 812, 'learning_rate': 0.6351221010640696, 'depth': 9}. Best is trial 0 with value: 0.015598954058755022. [I 2022-05-02 20:53:24,378] Trial 1 finished with value: 0.03955272838067788 and parameters: {'iterations': 1238, 'learning_rate': 0.00042079886696066364, 'depth': 5}. Best is trial 1 with value: 0.03955272838067788. [I 2022-05-02 20:55:46,873] Trial 2 finished with value: 0.035781989104643266 and parameters: {'iterations': 210, 'learning_rate': 0.29154431891537513, 'depth': 8}. Best is trial 1 with value: 0.03955272838067788. [I 2022-05-02 21:13:46,587] Trial 3 finished with value: 0.04383621612161705 and parameters: {'iterations': 1446, 'learning_rate': 0.00012087541473056971, 'depth': 10}. Best is trial 3 with value: 0.04383621612161705. [I 2022-05-02 21:20:29,875] Trial 4 finished with value: 0.04329294203991884 and parameters: {'iterations': 1682, 'learning_rate': 0.0007068974950624604, 'depth': 5}. Best is trial 3 with value: 0.04383621612161705. [I 2022-05-02 21:24:07,137] Trial 5 finished with value: 0.044841446893135364 and parameters: {'iterations': 448, 'learning_rate': 0.0016480446427978971, 'depth': 7}. Best is trial 5 with value: 0.044841446893135364. [I 2022-05-02 21:32:20,293] Trial 6 finished with value: 0.0488792949514575 and parameters: {'iterations': 921, 'learning_rate': 0.0014618962793704966, 'depth': 8}. Best is trial 6 with value: 0.0488792949514575. [I 2022-05-02 21:34:46,175] Trial 7 finished with value: 0.041903476970997056 and parameters: {'iterations': 365, 'learning_rate': 0.0014742753159914669, 'depth': 6}. Best is trial 6 with value: 0.0488792949514575. [I 2022-05-02 21:38:47,358] Trial 8 finished with value: 0.05022553670821227 and parameters: {'iterations': 966, 'learning_rate': 0.13826232179369857, 'depth': 5}. 
Best is trial 8 with value: 0.05022553670821227. [I 2022-05-02 21:42:22,492] Trial 9 finished with value: 0.05568186814182655 and parameters: {'iterations': 1077, 'learning_rate': 0.0234238498471129, 'depth': 4}. Best is trial 9 with value: 0.05568186814182655. [I 2022-05-02 21:46:24,147] Trial 10 finished with value: 0.03764316492949973 and parameters: {'iterations': 1254, 'learning_rate': 0.00048094619675015767, 'depth': 4}. Best is trial 9 with value: 0.05568186814182655. [I 2022-05-02 22:05:09,777] Trial 11 finished with value: 0.016769418371694763 and parameters: {'iterations': 1903, 'learning_rate': 0.7286653737491037, 'depth': 9}. Best is trial 9 with value: 0.05568186814182655. [I 2022-05-02 22:11:06,046] Trial 12 finished with value: 0.0417021103285943 and parameters: {'iterations': 679, 'learning_rate': 0.00024586032763280086, 'depth': 8}. Best is trial 9 with value: 0.05568186814182655. [I 2022-05-02 22:17:08,117] Trial 13 finished with value: 0.041413498008434316 and parameters: {'iterations': 936, 'learning_rate': 0.00030771802712506853, 'depth': 7}. Best is trial 9 with value: 0.05568186814182655. [I 2022-05-02 22:18:25,014] Trial 14 finished with value: 0.044528835955442506 and parameters: {'iterations': 165, 'learning_rate': 0.43379206974909373, 'depth': 5}. Best is trial 9 with value: 0.05568186814182655. [I 2022-05-02 22:26:38,172] Trial 15 finished with value: 0.05085371345839203 and parameters: {'iterations': 1359, 'learning_rate': 0.0017654048052495078, 'depth': 7}. Best is trial 9 with value: 0.05568186814182655. [I 2022-05-02 22:40:25,813] Trial 16 finished with value: 0.04777174383159533 and parameters: {'iterations': 1139, 'learning_rate': 0.0005488047000766049, 'depth': 10}. Best is trial 9 with value: 0.05568186814182655. [I 2022-05-02 23:01:31,490] Trial 17 finished with value: 0.018877487093125016 and parameters: {'iterations': 1573, 'learning_rate': 0.5727904470799616, 'depth': 10}. Best is trial 9 with value: 0.05568186814182655. 
[I 2022-05-02 23:06:07,496] Trial 18 finished with value: 0.033435831845056926 and parameters: {'iterations': 1236, 'learning_rate': 0.48696409415208936, 'depth': 4}. Best is trial 9 with value: 0.05568186814182655. [I 2022-05-02 23:09:34,691] Trial 19 finished with value: 0.03742721363803709 and parameters: {'iterations': 472, 'learning_rate': 0.00015167330688076205, 'depth': 6}. Best is trial 9 with value: 0.05568186814182655. [I 2022-05-02 23:20:10,300] Trial 20 finished with value: 0.048779674492417 and parameters: {'iterations': 838, 'learning_rate': 0.0012172958098369967, 'depth': 9}. Best is trial 9 with value: 0.05568186814182655. [I 2022-05-02 23:26:14,786] Trial 21 finished with value: 0.04639859105643161 and parameters: {'iterations': 778, 'learning_rate': 0.0013296521457299515, 'depth': 7}. Best is trial 9 with value: 0.05568186814182655. [I 2022-05-02 23:28:00,399] Trial 22 finished with value: 0.053253665181782046 and parameters: {'iterations': 367, 'learning_rate': 0.16172900811143134, 'depth': 4}. Best is trial 9 with value: 0.05568186814182655. [I 2022-05-02 23:35:51,391] Trial 23 finished with value: 0.047606668662971446 and parameters: {'iterations': 1976, 'learning_rate': 0.12273800987852965, 'depth': 5}. Best is trial 9 with value: 0.05568186814182655. [I 2022-05-02 23:37:23,990] Trial 24 finished with value: 0.04465505855161698 and parameters: {'iterations': 110, 'learning_rate': 0.18274508859816008, 'depth': 8}. Best is trial 9 with value: 0.05568186814182655. [I 2022-05-02 23:42:07,021] Trial 25 finished with value: 0.0510479129275434 and parameters: {'iterations': 1485, 'learning_rate': 0.12164139351417062, 'depth': 4}. Best is trial 9 with value: 0.05568186814182655. [I 2022-05-02 23:52:31,984] Trial 26 finished with value: 0.04420323540665883 and parameters: {'iterations': 781, 'learning_rate': 0.00029072088906598463, 'depth': 10}. Best is trial 9 with value: 0.05568186814182655. 
[I 2022-05-02 23:56:55,996] Trial 27 finished with value: 0.04613187748558095 and parameters: {'iterations': 1284, 'learning_rate': 0.00210664860170422, 'depth': 4}. Best is trial 9 with value: 0.05568186814182655. [I 2022-05-03 00:04:20,395] Trial 28 finished with value: 0.05017677899308446 and parameters: {'iterations': 691, 'learning_rate': 0.0019986340778528873, 'depth': 9}. Best is trial 9 with value: 0.05568186814182655. [I 2022-05-03 00:12:59,248] Trial 29 finished with value: 0.028405286388197913 and parameters: {'iterations': 1311, 'learning_rate': 0.35387588647792356, 'depth': 7}. Best is trial 9 with value: 0.05568186814182655.
# Report the winning hyper-parameter combination and its objective value.
best_params = study.best_params
best_value = study.best_value
print(best_params)
print(best_value)
{'iterations': 1077, 'learning_rate': 0.0234238498471129, 'depth': 4}
0.05568186814182655
# Parallel-coordinate plot: one line per trial traced through its sampled
# hyper-parameter values, coloured by objective value.
fig = optuna.visualization.plot_parallel_coordinate(study)
fig.show(renderer = "notebook")
# Relative importance of each hyper parameter to the objective.
fig = optuna.visualization.plot_param_importances(study)
fig.show(renderer='notebook')